| author | 张智皓 <[email protected]> | 2023-08-04 03:19:20 +0000 |
|---|---|---|
| committer | 张智皓 <[email protected]> | 2023-08-04 03:19:20 +0000 |
| commit | 1465a15b8c1ec5bae6e80104a14bcbab8ab05409 (patch) | |
| tree | 7933dda57a75dd0c41370e69bf0e029de840f04d | |
| parent | b5e9713161553dcdac9b8432abc52ca37060f0bb (diff) | |
Upload new file
| mode | file | insertions |
|---|---|---|
| -rw-r--r-- | userlib/src/siw_uverbs.c | 493 |

1 file changed, 493 insertions, 0 deletions
diff --git a/userlib/src/siw_uverbs.c b/userlib/src/siw_uverbs.c
new file mode 100644
index 0000000..f3fbee9
--- /dev/null
+++ b/userlib/src/siw_uverbs.c
@@ -0,0 +1,493 @@
+/*
+ * Software iWARP library for Linux
+ *
+ * Authors: Bernard Metzler <[email protected]>
+ *
+ * Copyright (c) 2008-2016, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ *
+ * - Neither the name of IBM nor the names of its contributors may be
+ *   used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if HAVE_CONFIG_H
+# include <config.h>
+#endif /* HAVE_CONFIG_H */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <pthread.h>
+
+#include <siw_user.h>
+#include "siw.h"
+#include "siw_abi.h"
+
+#define _load_shared(a)		(*(volatile typeof(a) *)&(a))
+#define _store_shared(a, b)	do {				\
+		_load_shared(a) = (b); wmb();			\
+	} while (0)
+
+extern const int siw_debug;
+extern int rdma_db_nr;
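+
+/*
+ * Work queues and completion queues are shared with the kernel
+ * driver. Ownership of a queue element is signalled through the
+ * SIW_WQE_VALID bit in its flags word: _load_shared() forces a
+ * volatile read, _store_shared() a volatile store followed by a
+ * write memory barrier.
+ */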
+
+int siw_notify_cq(struct ibv_cq *ibcq, int solicited)
+{
+        struct siw_cq *cq = cq_ofa2siw(ibcq);
+        int rv = 0;
+
+        if (cq->ctrl) {
+                if (solicited)
+                        _store_shared(cq->ctrl->notify, SIW_NOTIFY_SOLICITED);
+                else
+                        _store_shared(cq->ctrl->notify, SIW_NOTIFY_SOLICITED |
+                                      SIW_NOTIFY_NEXT_COMPLETION);
+        } else {
+                pthread_spin_lock(&cq->lock);
+                rv = ibv_cmd_req_notify_cq(ibcq, solicited);
+                pthread_spin_unlock(&cq->lock);
+        }
+        return rv;
+}
+
+int siw_post_send_ofed(struct ibv_qp *ofa_qp, struct ibv_send_wr *wr,
+                       struct ibv_send_wr **bad_wr)
+{
+        struct siw_qp *qp = qp_ofa2siw(ofa_qp);
+        int rv;
+
+        pthread_spin_lock(&qp->sq_lock);
+        rv = ibv_cmd_post_send(ofa_qp, wr, bad_wr);
+        pthread_spin_unlock(&qp->sq_lock);
+
+        return rv;
+}
+
+int siw_post_recv_ofed(struct ibv_qp *ofa_qp, struct ibv_recv_wr *wr,
+                       struct ibv_recv_wr **bad_wr)
+{
+        struct siw_qp *qp = qp_ofa2siw(ofa_qp);
+        int rv;
+
+        pthread_spin_lock(&qp->rq_lock);
+        rv = ibv_cmd_post_recv(ofa_qp, wr, bad_wr);
+        pthread_spin_unlock(&qp->rq_lock);
+
+        return rv;
+}
+
+int siw_post_srq_recv_ofed(struct ibv_srq *ofa_srq, struct ibv_recv_wr *wr,
+                           struct ibv_recv_wr **bad_wr)
+{
+        struct siw_srq *srq = srq_ofa2siw(ofa_srq);
+        int rv;
+
+        pthread_spin_lock(&srq->lock);
+        rv = ibv_cmd_post_srq_recv(ofa_srq, wr, bad_wr);
+        pthread_spin_unlock(&srq->lock);
+
+        return rv;
+}
+
+int siw_poll_cq_ofed(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
+{
+        struct siw_cq *cq = cq_ofa2siw(ibcq);
+        int rv;
+
+        pthread_spin_lock(&cq->lock);
+        rv = ibv_cmd_poll_cq(ibcq, num_entries, wc);
+        pthread_spin_unlock(&cq->lock);
+
+        return rv;
+}
+
+static enum siw_opcode map_send_opcode(enum ibv_wr_opcode ibv_op)
+{
+        switch (ibv_op) {
+        case IBV_WR_SEND:       return SIW_OP_SEND;
+        case IBV_WR_RDMA_WRITE: return SIW_OP_WRITE;
+        case IBV_WR_RDMA_READ:  return SIW_OP_READ;
+        default:
+                printf("op %d not supported\n", ibv_op);
+        }
+        return SIW_NUM_OPCODES + 1;
+}
+
+static inline uint16_t map_send_flags(int ibv_flags)
+{
+        uint16_t flags = SIW_WQE_VALID;
+
+        if (ibv_flags & IBV_SEND_SIGNALED)
+                flags |= SIW_WQE_SIGNALLED;
+        if (ibv_flags & IBV_SEND_SOLICITED)
+                flags |= SIW_WQE_SOLICITED;
+        if (ibv_flags & IBV_SEND_INLINE)
+                flags |= SIW_WQE_INLINE;
+        if (ibv_flags & IBV_SEND_FENCE)
+                flags |= SIW_WQE_READ_FENCE;
+
+        return flags;
+}
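+
+/*
+ * Translate one OFA (libibverbs) send work request into a siw SQE.
+ * Inline data is copied into the SQE directly behind the first SGE.
+ * The SQE becomes visible to the kernel only with the final flags
+ * update, after all other fields are in place.
+ */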
+static inline int push_send_wqe(struct ibv_send_wr *ofa_wr,
+                                struct siw_sqe *siw_sqe, int sig_all)
+{
+        uint32_t flags = map_send_flags(ofa_wr->send_flags);
+
+        siw_sqe->id = ofa_wr->wr_id;
+        siw_sqe->num_sge = ofa_wr->num_sge;
+        siw_sqe->raddr = ofa_wr->wr.rdma.remote_addr;
+        siw_sqe->rkey = ofa_wr->wr.rdma.rkey;
+        siw_sqe->opcode = map_send_opcode(ofa_wr->opcode);
+
+        if (sig_all)
+                flags |= SIW_WQE_SIGNALLED;
+
+        if (flags & SIW_WQE_INLINE) {
+                char *db = (char *)&siw_sqe->sge[1];
+                int bytes = 0, i = 0;
+
+                if (ofa_wr->num_sge > SIW_MAX_SGE) {
+                        if (siw_debug)
+                                printf("too many SGEs: %d\n", ofa_wr->num_sge);
+                        return -EINVAL;
+                }
+                while (i < ofa_wr->num_sge) {
+                        bytes += ofa_wr->sg_list[i].length;
+                        if (bytes > (int)SIW_MAX_INLINE) {
+                                if (siw_debug)
+                                        printf("inline data too long: %d:%d\n",
+                                               bytes, (int)SIW_MAX_INLINE);
+                                return -EINVAL;
+                        }
+                        memcpy(db, (void *)ofa_wr->sg_list[i].addr,
+                               ofa_wr->sg_list[i].length);
+                        db += ofa_wr->sg_list[i++].length;
+                }
+                siw_sqe->sge[0].length = bytes;
+        } else if (ofa_wr->num_sge == 1) {
+                siw_sqe->sge[0].laddr = ofa_wr->sg_list[0].addr;
+                siw_sqe->sge[0].length = ofa_wr->sg_list[0].length;
+                siw_sqe->sge[0].lkey = ofa_wr->sg_list[0].lkey;
+        } else if (ofa_wr->num_sge && ofa_wr->num_sge <= SIW_MAX_SGE) {
+                /* this assumes the same layout of siw and OFA SGEs */
+                memcpy(siw_sqe->sge, ofa_wr->sg_list,
+                       siw_sqe->num_sge * sizeof(struct ibv_sge));
+        } else {
+                return 1;
+        }
+
+        if (siw_debug)
+                printf("push SQ len %u, id %lx, op %d, num_sge %d, addr %lx\n",
+                       siw_sqe->sge[0].length, siw_sqe->id, siw_sqe->opcode,
+                       siw_sqe->num_sge, siw_sqe->sge[0].laddr);
+
+        _store_shared(siw_sqe->flags, flags);
+
+        return 0;
+}
+
+/* Ring the SQ doorbell by writing an empty post_send command to the
+ * kernel's uverbs command file. */
+static int siw_db_ofa(struct ibv_qp *ofa_qp)
+{
+        struct ibv_post_send req;
+        struct ibv_post_send_resp resp;
+        int rv;
+
+        req.command = IB_USER_VERBS_CMD_POST_SEND;
+        req.in_words = (sizeof req) / 4;
+        req.out_words = (sizeof resp) / 4;
+        req.response = (uintptr_t)&resp;
+        req.qp_handle = ofa_qp->handle;
+        req.wr_count = 0;
+        req.sge_count = 0;
+        req.wqe_size = sizeof(struct ibv_send_wr);
+
+        rv = write(ofa_qp->context->cmd_fd, &req, sizeof req);
+        if (rv == sizeof req)
+                rv = 0;
+        else
+                perror("write");
+
+        return rv;
+}
+
+int siw_post_send_mapped(struct ibv_qp *ofa_qp, struct ibv_send_wr *wr,
+                         struct ibv_send_wr **bad_wr)
+{
+        struct siw_qp *qp = qp_ofa2siw(ofa_qp);
+        uint32_t sq_put;
+        int rv = 0;
+
+        pthread_spin_lock(&qp->sq_lock);
+
+        *bad_wr = NULL;
+        sq_put = qp->sq_put;
+
+        /*
+         * Push all work requests into the mapped SQ, then ring the
+         * doorbell: via syscall if available, else via an empty OFA
+         * post_send call.
+         */
+        while (wr) {
+                int idx = sq_put % qp->num_sqe;
+                struct siw_sqe *sqe = &qp->sendq[idx];
+                uint16_t sqe_flags = _load_shared(sqe->flags);
+
+                rmb();
+
+                if (!(sqe_flags & SIW_WQE_VALID)) {
+                        if (push_send_wqe(wr, sqe, qp->sq_sig_all)) {
+                                rv = -EINVAL;
+                                *bad_wr = wr;
+                                break;
+                        }
+                } else {
+                        if (siw_debug)
+                                printf("QP[%d]: SQ overflow, idx %d\n",
+                                       qp->id, idx);
+                        rv = -ENOMEM;
+                        *bad_wr = wr;
+                        break;
+                }
+                sq_put++;
+                wr = wr->next;
+        }
+        if (sq_put != qp->sq_put) {
+                if (rdma_db_nr > 0)
+                        rv = syscall(rdma_db_nr, SIW_DB_SQ,
+                                     qp->dev_id, qp->id);
+                else
+                        rv = siw_db_ofa(ofa_qp);
+                if (rv)
+                        *bad_wr = wr;
+
+                qp->sq_put = sq_put;
+        }
+        pthread_spin_unlock(&qp->sq_lock);
+
+        return rv;
+}
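+
+/*
+ * Unlike the SQ path, no doorbell is rung for receive queues:
+ * posting only fills the next free RQE and marks it valid for
+ * consumption by the kernel.
+ */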
+static inline int push_recv_wqe(struct ibv_recv_wr *ofa_wr,
+                                struct siw_rqe *siw_rqe)
+{
+        siw_rqe->id = ofa_wr->wr_id;
+        siw_rqe->num_sge = ofa_wr->num_sge;
+
+        if (ofa_wr->num_sge == 1) {
+                siw_rqe->sge[0].laddr = ofa_wr->sg_list[0].addr;
+                siw_rqe->sge[0].length = ofa_wr->sg_list[0].length;
+                siw_rqe->sge[0].lkey = ofa_wr->sg_list[0].lkey;
+        } else if (ofa_wr->num_sge && ofa_wr->num_sge <= SIW_MAX_SGE) {
+                /* this assumes the same layout of siw and OFA SGEs */
+                memcpy(siw_rqe->sge, ofa_wr->sg_list,
+                       sizeof(struct ibv_sge) * ofa_wr->num_sge);
+        } else {
+                return 1;
+        }
+
+        if (siw_debug)
+                printf("push RQ len %u, id %lx, num_sge %d\n",
+                       siw_rqe->sge[0].length, siw_rqe->id, siw_rqe->num_sge);
+
+        _store_shared(siw_rqe->flags, SIW_WQE_VALID);
+
+        return 0;
+}
+
+int siw_post_recv_mapped(struct ibv_qp *ofa_qp, struct ibv_recv_wr *wr,
+                         struct ibv_recv_wr **bad_wr)
+{
+        struct siw_qp *qp = qp_ofa2siw(ofa_qp);
+        uint32_t rq_put;
+        int rv = 0;
+
+        pthread_spin_lock(&qp->rq_lock);
+
+        *bad_wr = NULL;
+        rq_put = qp->rq_put;
+
+        while (wr) {
+                int idx = rq_put % qp->num_rqe;
+                struct siw_rqe *rqe = &qp->recvq[idx];
+                uint32_t rqe_flags = _load_shared(rqe->flags);
+
+                rmb();
+
+                if (!(rqe_flags & SIW_WQE_VALID)) {
+                        if (push_recv_wqe(wr, rqe)) {
+                                rv = -EINVAL;
+                                *bad_wr = wr;
+                                break;
+                        }
+                } else {
+                        if (siw_debug)
+                                printf("QP[%d]: RQ overflow, idx %d\n",
+                                       qp->id, idx);
+                        rv = -ENOMEM;
+                        *bad_wr = wr;
+                        break;
+                }
+                rq_put++;
+                wr = wr->next;
+        }
+        qp->rq_put = rq_put;
+
+        pthread_spin_unlock(&qp->rq_lock);
+
+        return rv;
+}
+
+int siw_post_srq_recv_mapped(struct ibv_srq *ofa_srq, struct ibv_recv_wr *wr,
+                             struct ibv_recv_wr **bad_wr)
+{
+        struct siw_srq *srq = srq_ofa2siw(ofa_srq);
+        uint32_t srq_put;
+        int rv = 0;
+
+        pthread_spin_lock(&srq->lock);
+
+        *bad_wr = NULL;
+        srq_put = srq->rq_put;
+
+        while (wr) {
+                int idx = srq_put % srq->num_rqe;
+                struct siw_rqe *rqe = &srq->recvq[idx];
+                uint32_t rqe_flags = _load_shared(rqe->flags);
+
+                rmb();
+
+                if (!(rqe_flags & SIW_WQE_VALID)) {
+                        if (push_recv_wqe(wr, rqe)) {
+                                rv = -EINVAL;
+                                *bad_wr = wr;
+                                break;
+                        }
+                } else {
+                        if (siw_debug)
+                                printf("SRQ[%p]: SRQ overflow\n", srq);
+                        rv = -ENOMEM;
+                        *bad_wr = wr;
+                        break;
+                }
+                srq_put++;
+                wr = wr->next;
+        }
+        srq->rq_put = srq_put;
+
+        pthread_spin_unlock(&srq->lock);
+
+        return rv;
+}
+
+static struct {
+        enum siw_opcode siw;
+        enum ibv_wc_opcode ofa;
+} map_cqe_opcode[SIW_NUM_OPCODES] = {
+        {SIW_OP_WRITE,          IBV_WC_RDMA_WRITE},
+        {SIW_OP_READ,           IBV_WC_RDMA_READ},
+        {SIW_OP_SEND,           IBV_WC_SEND},
+        {SIW_OP_SEND_WITH_IMM,  -1}, /* unsupported */
+        {SIW_OP_FETCH_AND_ADD,  IBV_WC_FETCH_ADD},
+        {SIW_OP_COMP_AND_SWAP,  IBV_WC_COMP_SWAP},
+        {SIW_OP_INVAL_STAG,     -1}, /* unsupported */
+        {SIW_OP_RECEIVE,        IBV_WC_RECV}
+};
+
+static struct {
+        enum siw_wc_status siw;
+        enum ibv_wc_status ofa;
+} map_cqe_status[SIW_NUM_WC_STATUS] = {
+        {SIW_WC_SUCCESS,        IBV_WC_SUCCESS},
+        {SIW_WC_LOC_LEN_ERR,    IBV_WC_LOC_LEN_ERR},
+        {SIW_WC_LOC_PROT_ERR,   IBV_WC_LOC_PROT_ERR},
+        {SIW_WC_LOC_QP_OP_ERR,  IBV_WC_LOC_QP_OP_ERR},
+        {SIW_WC_WR_FLUSH_ERR,   IBV_WC_WR_FLUSH_ERR},
+        {SIW_WC_BAD_RESP_ERR,   IBV_WC_BAD_RESP_ERR},
+        {SIW_WC_LOC_ACCESS_ERR, IBV_WC_LOC_ACCESS_ERR},
+        {SIW_WC_REM_ACCESS_ERR, IBV_WC_REM_ACCESS_ERR},
+        {SIW_WC_GENERAL_ERR,    IBV_WC_GENERAL_ERR}
+};
+
+static inline void copy_cqe(struct siw_cqe *cqe, struct ibv_wc *wc)
+{
+        if (siw_debug)
+                printf("report CQE len %u, id %lx, op %d, status %d, QP %u\n",
+                       cqe->bytes, cqe->id, cqe->opcode, cqe->status,
+                       (uint32_t)cqe->qp_id);
+
+        wc->wr_id = cqe->id;
+        wc->byte_len = cqe->bytes;
+
+        /* No immediate data supported yet */
+        wc->wc_flags = 0;
+        wc->imm_data = 0;
+
+        wc->vendor_err = 0;
+        wc->opcode = map_cqe_opcode[cqe->opcode].ofa;
+        wc->status = map_cqe_status[cqe->status].ofa;
+        wc->qp_num = (uint32_t)cqe->qp_id;
+
+        wmb();
+        _store_shared(cqe->flags, 0);
+}
+
+int siw_poll_cq_mapped(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
+{
+        struct siw_cq *cq = cq_ofa2siw(ibcq);
+        int new = 0;
+
+        for (; num_entries--; wc++) {
+                struct siw_cqe *cqe;
+
+                pthread_spin_lock(&cq->lock);
+
+                cqe = &cq->queue[cq->cq_get % cq->num_cqe];
+
+                if (_load_shared(cqe->flags) & SIW_WQE_VALID) {
+                        copy_cqe(cqe, wc);
+                        ++cq->cq_get;
+                        pthread_spin_unlock(&cq->lock);
+                } else {
+                        pthread_spin_unlock(&cq->lock);
+                        break;
+                }
+                new++;
+        }
+        return new;
+}
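For context, the file provides both `*_mapped` (shared-queue) and `*_ofed` (command-channel) variants of each fast-path verb. Below is a minimal sketch of how the mapped variants could be installed into a legacy libibverbs provider's ops table; the function `siw_install_mapped_ops` and its call site are illustrative assumptions, not part of this commit, and a real provider would choose between the mapped and OFED variants at context creation depending on whether the kernel exposed the mmap()ed queues:

```c
#include <infiniband/verbs.h>

/* Illustrative sketch (not from this commit): install the mapped
 * fast-path verbs into the provider's ops table at context setup. */
static void siw_install_mapped_ops(struct ibv_context *ctx)
{
        ctx->ops.poll_cq       = siw_poll_cq_mapped;
        ctx->ops.req_notify_cq = siw_notify_cq;
        ctx->ops.post_send     = siw_post_send_mapped;
        ctx->ops.post_recv     = siw_post_recv_mapped;
        ctx->ops.post_srq_recv = siw_post_srq_recv_mapped;
}
```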
