lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [day] [month] [year] [list]
Message-Id: <1308228159-22737-1-git-send-email-bmt@zurich.ibm.com>
Date:	Thu, 16 Jun 2011 14:42:39 +0200
From:	Bernard Metzler <bmt@...ich.ibm.com>
To:	netdev@...r.kernel.org
Cc:	linux-rdma@...r.kernel.org, Bernard Metzler <bmt@...ich.ibm.com>
Subject: [PATCH 11/14] SIWv2: Receive path: siw_qp_rx.c

---
 drivers/infiniband/hw/siw/siw_qp_rx.c | 1557 +++++++++++++++++++++++++++++++++
 1 files changed, 1557 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp_rx.c

diff --git a/drivers/infiniband/hw/siw/siw_qp_rx.c b/drivers/infiniband/hw/siw/siw_qp_rx.c
new file mode 100644
index 0000000..2e9045c
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp_rx.c
@@ -0,0 +1,1557 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@...ich.ibm.com>
+ *          Fredy Neeser <nfd@...ich.ibm.com>
+ *
+ * Copyright (c) 2008-2011, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+/*
+ * ----------------------------
+ * DDP reassembly for Softiwarp
+ * ----------------------------
+ * For the ordering of transmitted DDP segments, the relevant iWARP ordering
+ * rules are as follows:
+ *
+ * - RDMAP (RFC 5040): Section 7.5, Rule 17:
+ *   "RDMA Read Response Message processing at the Remote Peer (reading
+ *    the specified Tagged Buffer) MUST be started only after the RDMA
+ *    Read Request Message has been Delivered by the DDP layer (thus,
+ *    all previous RDMA Messages have been properly submitted for
+ *    ordered Placement)."
+ *
+ * - DDP (RFC 5041): Section 5.3:
+ *   "At the Data Source, DDP:
+ *    o MUST transmit DDP Messages in the order they were submitted to
+ *      the DDP layer,
+ *    o SHOULD transmit DDP Segments within a DDP Message in increasing
+ *      MO order for Untagged DDP Messages, and in increasing TO order
+ *      for Tagged DDP Messages."
+ *
+ * Combining these rules implies that, although RDMAP does not provide
+ * ordering between operations that are generated from the two ends of an
+ * RDMAP stream, DDP *must not* transmit an RDMA Read Response Message before
+ * it has finished transmitting SQ operations that were already submitted
+ * to the DDP layer. It follows that an iWARP transmitter must fully
+ * serialize RDMAP messages belonging to the same QP.
+ *
+ * Given that a TCP socket receives DDP segments in peer transmit order,
+ * we obtain the following ordering of received DDP segments:
+ *
+ * (i)  the received DDP segments of RDMAP messages for the same QP
+ *      cannot be interleaved
+ * (ii) the received DDP segments of a single RDMAP message *should*
+ *      arrive in order.
+ *
+ * The Softiwarp transmitter obeys rule #2 in DDP Section 5.3.
+ * With this property, the "should" becomes a "must" in (ii) above,
+ * which simplifies DDP reassembly considerably.
+ * The Softiwarp receiver currently relies on this property
+ * and reports an error if DDP segments of the same RDMAP message
+ * do not arrive in sequence.
+ */
+
+/*
+ * siw_crc_rxhdr()
+ *
+ * (Re)initialize the receive CRC state of @ctx and run it over the
+ * header bytes received so far (ctx->fpdu_part_rcvd bytes starting
+ * at ctx->hdr). Returns whatever siw_crc_array() returns.
+ */
+static inline int siw_crc_rxhdr(struct siw_iwarp_rx *ctx)
+{
+	u8	*hdr_bytes = (u8 *)&ctx->hdr;
+
+	crypto_hash_init(&ctx->mpa_crc_hd);
+
+	return siw_crc_array(&ctx->mpa_crc_hd, hdr_bytes,
+			     ctx->fpdu_part_rcvd);
+}
+
+
+/*
+ * siw_rx_umem_init()
+ *
+ * Given memory region @mr and tagged offset @t_off within @mr,
+ * resolve the corresponding ib_umem_chunk memory chunk pointer
+ * and update the receive context variables (pg_idx, pg_off, umem_chunk)
+ * to point at the receive position.
+ * Returns 0 on success and -EINVAL if @t_off refers to a page beyond
+ * the end of the MR's chunk list.
+ *
+ * NOTE: This function expects virtual addresses.
+ * TODO: Function needs generalization to support relative addressing
+ *       aka "ZBVA".
+ *
+ * @rctx:	Receive Context to be updated
+ * @mr:		Memory Region
+ * @t_off:	Offset within Memory Region
+ *
+ */
+static int siw_rx_umem_init(struct siw_iwarp_rx *rctx, struct siw_mr *mr,
+			    u64 t_off)
+{
+	struct ib_umem_chunk	*chunk;
+	u64			off_mr;   /* offset into MR */
+	int			psge_idx; /* Index of PSGE */
+	int			found = 0;
+
+	off_mr = t_off - (mr->mem.va & PAGE_MASK);
+	/*
+	 * Equivalent to
+	 * off_mr = t_off - mr->mem.va;
+	 * off_mr += mr->umem->offset;
+	 */
+
+	/* Skip pages not referenced by t_off */
+	psge_idx = off_mr >> PAGE_SHIFT;
+
+	list_for_each_entry(chunk, &mr->umem->chunk_list, list) {
+		if (psge_idx < chunk->nents) {
+			found = 1;
+			break;
+		}
+		psge_idx -= chunk->nents;
+	}
+	/*
+	 * If the loop ran to completion, 'chunk' does not reference a
+	 * valid list entry (it is computed from the list head), so it
+	 * must not be dereferenced. Rely on an explicit flag instead of
+	 * inspecting chunk->nents.
+	 */
+	if (!found) {
+		dprint(DBG_MM|DBG_ON, "(QP%d): Short chunk list\n",
+			RX_QPID(rctx));
+		return -EINVAL;
+	}
+	rctx->pg_idx = psge_idx;
+	rctx->pg_off = off_mr & ~PAGE_MASK;
+	rctx->umem_chunk = chunk;
+
+	dprint(DBG_MM, "(QP%d): New chunk, idx %d\n", RX_QPID(rctx), psge_idx);
+	return 0;
+}
+
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @rctx.
+ * This function does not check if umem is within bounds requested by
+ * @len and @t_off. @umem_ends indicates if routine should
+ * not update chunk position pointers after the point it is
+ * currently receiving
+ *
+ * Data is copied page by page from rctx->skb at rctx->skb_offset into
+ * the pages of the current umem chunk. If rctx->crc_enabled is set, each
+ * copied piece is also fed into the running CRC.
+ *
+ * Returns the number of bytes placed (equal to @len on success) or
+ * -EFAULT if copying or CRC computation failed. In the failure case
+ * skb_copied/skb_new are still adjusted by the bytes accounted so far,
+ * but the chunk position (umem_chunk, pg_off) is not stored back.
+ *
+ * @rctx:	Receive Context
+ * @len:	Number of bytes to place
+ * @umem_ends:	1, if rctx chunk pointer should not be updated after len.
+ */
+static int siw_rx_umem(struct siw_iwarp_rx *rctx, int len, int umem_ends)
+{
+	struct scatterlist	*p_list;
+	void			*dest;
+	struct ib_umem_chunk    *chunk = rctx->umem_chunk;
+	int			pg_off = rctx->pg_off,
+				copied = 0,
+				bytes,
+				rv;
+
+	while (len) {
+		/* never cross a page boundary within one copy */
+		bytes  = min(len, (int)PAGE_SIZE - pg_off);
+		p_list = &chunk->page_list[rctx->pg_idx];
+
+		dest = kmap_atomic(sg_page(p_list), KM_SOFTIRQ0);
+
+		rv = skb_copy_bits(rctx->skb, rctx->skb_offset, dest + pg_off,
+				   bytes);
+
+		dprint(DBG_RX, "(QP%d): Page #%d, "
+			"bytes=%u, rv=%d returned by skb_copy_bits()\n",
+			RX_QPID(rctx), rctx->pg_idx, bytes, rv);
+
+		if (likely(!rv)) {
+			/* CRC is computed over the freshly placed data */
+			if (rctx->crc_enabled)
+				rv = siw_crc_sg(&rctx->mpa_crc_hd, p_list,
+						pg_off, bytes);
+
+			rctx->skb_offset += bytes;
+			copied += bytes;
+			len -= bytes;
+			pg_off += bytes;
+		}
+
+		/* unmap before any error exit below */
+		kunmap_atomic(dest, KM_SOFTIRQ0);
+
+		if (unlikely(rv)) {
+			/* rv is non-zero from either the copy or the CRC step */
+			rctx->skb_copied += copied;
+			rctx->skb_new -= copied;
+			copied = -EFAULT;
+
+			dprint(DBG_RX|DBG_ON, "(QP%d): failed with %d\n",
+				RX_QPID(rctx), rv);
+
+			goto out;
+		}
+		if (pg_off == PAGE_SIZE) {
+			/*
+			 * end of page
+			 */
+			pg_off = 0;
+			/*
+			 * reference next page chunk if
+			 * - all pages in chunk used AND
+			 * - current loop fills more into this umem
+			 *   OR the next receive will go into this umem
+			 *   starting at the position where we are leaving
+			 *   the routine.
+			 */
+			if (++rctx->pg_idx == chunk->nents &&
+				(len > 0 || !umem_ends)) {
+
+				rctx->pg_idx = 0;
+				chunk = mem_chunk_next(chunk);
+			}
+		}
+	}
+	/*
+	 * store chunk position for resume
+	 */
+	rctx->umem_chunk = chunk;
+	rctx->pg_off = pg_off;
+
+	rctx->skb_copied += copied;
+	rctx->skb_new -= copied;
+out:
+	return copied;
+}
+
+/*
+ * siw_rx_kva()
+ *
+ * Copy @len bytes from the current skb position of @rctx to the kernel
+ * virtual address @kva and advance the skb accounting. If CRC is enabled,
+ * the copied bytes are added to the running CRC.
+ *
+ * Returns @len on success, otherwise the non-zero result of
+ * skb_copy_bits() or siw_crc_array().
+ */
+static inline int siw_rx_kva(struct siw_iwarp_rx *rctx, int len, void *kva)
+{
+	int rv;
+
+	rv = skb_copy_bits(rctx->skb, rctx->skb_offset, kva, len);
+	if (unlikely(rv))
+		return rv;
+
+	/* data is in place: account for it before checksumming */
+	rctx->skb_offset += len;
+	rctx->skb_copied += len;
+	rctx->skb_new -= len;
+
+	if (rctx->crc_enabled) {
+		rv = siw_crc_array(&rctx->mpa_crc_hd, kva, len);
+		if (rv)
+			return rv;
+	}
+	return len;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Convert the STag and tagged offset of an incoming RRESP header to
+ * host byte order (in place) and compare them with the values expected
+ * for the current RDMAP message. On the first DDP segment the expected
+ * values are taken from the first SGE of the pending read WQE.
+ * If more segments follow, the expected offset is advanced by the
+ * length of this segment; otherwise the total length is checked
+ * against the read request.
+ *
+ * Returns 0 if the header is acceptable, -EINVAL otherwise.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_rresp_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_rdma_rresp	*hdr = &rctx->hdr.rresp;
+	struct siw_wqe		*rreq = rctx->dest.wqe;
+
+	hdr->sink_stag = be32_to_cpu(hdr->sink_stag);
+	hdr->sink_to   = be64_to_cpu(hdr->sink_to);
+
+	if (rctx->first_ddp_seg) {
+		rctx->ddp_stag = rreq->wr.rread.sge[0].lkey;
+		rctx->ddp_to   = rreq->wr.rread.sge[0].addr;
+	}
+	if (hdr->sink_stag != rctx->ddp_stag) {
+		dprint(DBG_RX|DBG_ON,
+			" received STAG=%08x, expected STAG=%08x\n",
+			hdr->sink_stag, rctx->ddp_stag);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (hdr->sink_to != rctx->ddp_to) {
+		dprint(DBG_RX|DBG_ON,
+			" received TO=%016llx, expected TO=%016llx\n",
+			(unsigned long long)hdr->sink_to,
+			(unsigned long long)rctx->ddp_to);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->more_ddp_segs) {
+		/* expected offset of the next DDP segment */
+		rctx->ddp_to += rctx->fpdu_part_rem;
+		return 0;
+	}
+	/* last segment: response must exactly fill the read request */
+	if (rreq->processed + rctx->fpdu_part_rem != rreq->bytes) {
+		dprint(DBG_RX|DBG_ON,
+			" RRESP length does not match RREQ, "
+			"peer sent=%d, expected %d\n",
+			rreq->processed + rctx->fpdu_part_rem, rreq->bytes);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Convert the STag and tagged offset of an incoming WRITE header to
+ * host byte order (in place). The first DDP segment of a message
+ * defines the expected STag/offset; every later segment is compared
+ * against them. If more segments follow, the expected offset is
+ * advanced by the length of this segment.
+ *
+ * Returns 0 if the header is acceptable, -EINVAL otherwise.
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_write_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_rdma_write	*hdr = &rctx->hdr.rwrite;
+
+	hdr->sink_stag = be32_to_cpu(hdr->sink_stag);
+	hdr->sink_to   = be64_to_cpu(hdr->sink_to);
+
+	if (rctx->first_ddp_seg) {
+		/* first segment sets the reference values */
+		rctx->ddp_stag = hdr->sink_stag;
+		rctx->ddp_to   = hdr->sink_to;
+	} else if (rctx->ddp_stag != hdr->sink_stag) {
+		dprint(DBG_RX|DBG_ON,
+			" received STAG=%08x, expected STAG=%08x\n",
+			hdr->sink_stag, rctx->ddp_stag);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	} else if (rctx->ddp_to != hdr->sink_to) {
+		dprint(DBG_RX|DBG_ON,
+			" received TO=%016llx, expected TO=%016llx\n",
+			(unsigned long long)hdr->sink_to,
+			(unsigned long long)rctx->ddp_to);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	/*
+	 * Update expected target offset for next incoming DDP segment
+	 */
+	if (rctx->more_ddp_segs)
+		rctx->ddp_to += rctx->fpdu_part_rem;
+
+	return 0;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Convert MSN, MO and QN of an incoming SEND header to host byte order
+ * (in place) and check them against the expected values:
+ *  - QN must be RDMAP_UNTAGGED_QN_SEND,
+ *  - MSN must match the MSN expected for that queue,
+ *  - MO must match the number of bytes already placed into the
+ *    receive WQE,
+ *  - the remaining receive space must cover this segment; if it does
+ *    not, the WQE's wc_status is set to IB_WC_LOC_LEN_ERR.
+ * On the first DDP segment the SGE write position is reset.
+ *
+ * Returns 0 if the header is acceptable, -EINVAL otherwise.
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_send_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_send	*send = &rctx->hdr.send;
+	struct siw_wqe		*wqe = rctx->dest.wqe;
+
+	send->ddp_msn = be32_to_cpu(send->ddp_msn);
+	send->ddp_mo  = be32_to_cpu(send->ddp_mo);
+	send->ddp_qn  = be32_to_cpu(send->ddp_qn);
+
+	if (send->ddp_qn != RDMAP_UNTAGGED_QN_SEND) {
+		dprint(DBG_RX|DBG_ON, " Invalid DDP QN %d for SEND\n",
+			send->ddp_qn);
+		return -EINVAL;
+	}
+	if (send->ddp_msn != rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]) {
+		/* received value first, expected value second */
+		dprint(DBG_RX|DBG_ON, " received MSN=%d, expected MSN=%d\n",
+			send->ddp_msn, rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		/*
+		 * TODO: Error handling
+		 * async_event= RI_EVENT_QP_RQ_PROTECTION_ERROR_MSN_GAP;
+		 * cmpl_status= RI_WC_STATUS_LOCAL_QP_CATASTROPHIC;
+		 */
+		return -EINVAL;
+	}
+	if (send->ddp_mo != wqe->processed) {
+		dprint(DBG_RX|DBG_ON, " Received MO=%u, expected MO=%u\n",
+			send->ddp_mo, wqe->processed);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->first_ddp_seg) {
+		/* initialize user memory write position */
+		rctx->sge_idx = 0;
+		rctx->sge_off = 0;
+	}
+	if (wqe->bytes < wqe->processed + rctx->fpdu_part_rem) {
+		dprint(DBG_RX|DBG_ON, " Receive space short: %d < %d\n",
+			wqe->bytes - wqe->processed, rctx->fpdu_part_rem);
+		wqe->wc_status = IB_WC_LOC_LEN_ERR;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+/*
+ * siw_srq_fetch_wqe()
+ *
+ * Remove and return the first receive WQE of @srq, or NULL if the SRQ
+ * is empty. If the SRQ is armed and its fill level drops below the
+ * configured limit, disarm it and raise IB_EVENT_SRQ_LIMIT_REACHED.
+ * The whole operation runs under the SRQ lock.
+ */
+static inline struct siw_wqe *siw_srq_fetch_wqe(struct siw_srq *srq)
+{
+	struct siw_wqe *rqe;
+	int queued;
+
+	lock_srq(srq);
+
+	if (list_empty(&srq->rq)) {
+		unlock_srq(srq);
+		return NULL;
+	}
+	rqe = list_first_wqe(&srq->rq);
+	list_del_init(&rqe->list);
+	/*
+	 * The SRQ wqe is counted for SRQ space until completed.
+	 */
+	queued = srq->max_wr - (atomic_read(&srq->space) + 1);
+
+	if (srq->armed && queued < srq->limit) {
+		srq->armed = 0;
+		dprint(DBG_RX, " SRQ(%p): SRQ limit event\n", srq);
+		siw_async_srq_ev(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+	}
+	unlock_srq(srq);
+
+	return rqe;
+}
+
+/*
+ * siw_get_rqe()
+ *
+ * Fetch the next receive WQE for @qp: from the attached SRQ if there is
+ * one, from the QP's own RQ otherwise. A WQE taken from an SRQ gets
+ * bound to @qp, and a QP reference is taken for it.
+ * Returns NULL if the queue in use is empty.
+ */
+static inline struct siw_wqe *siw_get_rqe(struct siw_qp *qp)
+{
+	struct siw_wqe	*rqe;
+
+	if (qp->srq) {
+		rqe = siw_srq_fetch_wqe(qp->srq);
+		if (!rqe) {
+			dprint(DBG_RX, " QP(%d): SRQ empty!\n", QP_ID(qp));
+			return NULL;
+		}
+		siw_qp_get(qp);
+		rqe->qp = qp;
+
+		return rqe;
+	}
+	lock_rq(qp);
+
+	if (list_empty(&qp->rq)) {
+		unlock_rq(qp);
+		dprint(DBG_RX, " QP(%d): RQ empty!\n", QP_ID(qp));
+		return NULL;
+	}
+	rqe = list_first_wqe(&qp->rq);
+	list_del_init(&rqe->list);
+
+	unlock_rq(qp);
+
+	return rqe;
+}
+
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * On the first DDP segment a receive WQE is fetched via siw_get_rqe()
+ * and installed as the QP's current rx WQE; later segments continue
+ * with that WQE. The SEND header is validated only when entering in
+ * state SIW_GET_DATA_START.
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ *	-ENOENT: no receive WQE available
+ *	-EPROTO: no current rx WQE on a non-first segment
+ *	-EINVAL: header check or data placement failed
+ *	other:   a negative siw_check_sge() result is passed through
+ */
+int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_sge	*sge;
+	struct siw_mr	*mr;
+	u32		data_bytes,	/* all data bytes available */
+			rcvd_bytes;	/* sum of data bytes rcvd */
+	int		rv = 0;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+
+		wqe = siw_get_rqe(qp);
+		if (!wqe)
+			return -ENOENT;
+
+		rx_wqe(qp) = wqe;
+		wqe->wr_status = SR_WR_INPROGRESS;
+	} else  {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			/*
+			 * this is a siw bug!
+			 */
+			dprint(DBG_ON, "QP(%d): RQ failure\n", QP_ID(qp));
+			return -EPROTO;
+		}
+	}
+	if (rctx->state == SIW_GET_DATA_START) {
+		rv = siw_send_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+		if (!rctx->fpdu_part_rem) /* zero length SEND */
+			return 0;
+	}
+	/* place no more than what this skb currently provides */
+	data_bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	rcvd_bytes = 0;
+
+	while (data_bytes) {
+		struct siw_pd	*pd;
+		u32	sge_bytes;	/* data bytes avail for SGE */
+
+		sge = &wqe->wr.sgl.sge[rctx->sge_idx];
+
+		if (!sge->len) {
+			/* just skip empty sge's */
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+			continue;
+		}
+		sge_bytes = min(data_bytes, sge->len - rctx->sge_off);
+
+		/*
+		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
+		 */
+		pd = qp->srq == NULL ? qp->pd : qp->srq->pd;
+
+		rv = siw_check_sge(pd, sge, SR_MEM_LWRITE, rctx->sge_off,
+				   sge_bytes);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+			break;
+		}
+		mr = siw_mem2mr(sge->mem.obj);
+
+		if (mr->umem) {
+			/*
+			 * Are we going to finish placing
+			 * - the last fragment of the current SGE or
+			 * - the last DDP segment (L=1) of the current
+			 *   RDMAP message?
+			 *
+			 * siw_rx_umem() must advance umem page_chunk position
+			 * after successful receive only, if receive into
+			 * current umem does not end.
+			 * umem ends, if:
+			 *   - current SGE gets completely filled, OR
+			 *   - current MPA FPDU is last AND gets consumed now
+			 *
+			 * NOTE(review): fpdu_part_rem is decremented and
+			 * fpdu_part_rcvd incremented as data is placed (see
+			 * end of this loop), so comparing
+			 * fpdu_part_rcvd + sge_bytes against fpdu_part_rem
+			 * looks suspicious for "FPDU gets consumed now" --
+			 * confirm intended semantics.
+			 */
+			int umem_ends =
+				((sge_bytes + rctx->sge_off == sge->len) ||
+				  (!rctx->more_ddp_segs &&
+				   rctx->fpdu_part_rcvd + sge_bytes ==
+				   rctx->fpdu_part_rem)) ? 1 : 0;
+
+			if (rctx->sge_off == 0) {
+				/*
+				 * started a new sge: update receive pointers
+				 */
+				rv = siw_rx_umem_init(rctx, mr, sge->addr);
+				if (rv)
+					break;
+			}
+			rv = siw_rx_umem(rctx, sge_bytes, umem_ends);
+		} else
+			rv = siw_rx_kva(rctx, sge_bytes,
+					(void *)(sge->addr + rctx->sge_off));
+		if (rv != sge_bytes) {
+			/* record what was placed before the failure */
+			wqe->processed += rcvd_bytes;
+			return -EINVAL;
+		}
+		rctx->sge_off += rv;
+
+		if (rctx->sge_off == sge->len) {
+			/* SGE full: continue with the next one */
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+		}
+		data_bytes -= rv;
+		rcvd_bytes += rv;
+
+		rctx->fpdu_part_rem -= rv;
+		rctx->fpdu_part_rcvd += rv;
+	}
+	wqe->processed += rcvd_bytes;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	/* loop left early via break (rv < 0) or skb data exhausted */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * The target memory object is looked up from the STag on the first
+ * DDP segment and kept as the QP's current rx memory afterwards.
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ *	-EFAULT: stale rx memory reference found on first segment
+ *	-EINVAL: unknown STag or data placement failed
+ *	other:   non-zero results of siw_write_check_ntoh() and
+ *	         siw_check_mem() are passed through
+ */
+
+int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_dev		*dev = qp->hdr.dev;
+	struct iwarp_rdma_write	*write = &rctx->hdr.rwrite;
+	struct siw_mem		*mem;
+	struct siw_mr		*mr;
+	int			bytes,
+				rv;
+
+	if (rctx->state == SIW_GET_DATA_START) {
+
+		if (!rctx->fpdu_part_rem) /* zero length WRITE */
+			return 0;
+
+		rv = siw_write_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	}
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+
+	/*
+	 * NOTE: bytes > 0 is always true, since this routine
+	 * gets only called if so.
+	 */
+	if (rctx->first_ddp_seg) {
+		/* DEBUG Code, to be removed */
+		if (rx_mem(qp) != NULL) {
+			dprint(DBG_RX|DBG_ON, "(QP%d): Stale rctx state!\n",
+				QP_ID(qp));
+			return -EFAULT;
+		}
+		/* lookup uses the STag with its low 8 bits shifted out */
+		rx_mem(qp) = siw_mem_id2obj(dev, rctx->ddp_stag >> 8);
+	}
+	if (rx_mem(qp) == NULL) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): "
+			"Sink STag not found or invalid,  STag=0x%08x\n",
+			QP_ID(qp), rctx->ddp_stag);
+		return -EINVAL;
+	}
+	mem = rx_mem(qp);
+	/*
+	 * Rtag not checked against mem's tag again because
+	 * hdr check guarantees same tag as before if fragmented
+	 */
+	rv = siw_check_mem(qp->pd, mem, write->sink_to + rctx->fpdu_part_rcvd,
+			   SR_MEM_RWRITE, bytes);
+	if (rv) {
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		return rv;
+	}
+	mr = siw_mem2mr(mem);
+
+	if (mr->umem) {
+		/*
+		 * Are we going to place the last piece of the last
+		 * DDP segment of the current RDMAP message?
+		 *
+		 * It is last if:
+		 * - rctx->fpdu_part_rem <= rctx->skb_new AND
+		 * - payload_rem (of current DDP segment) <= rctx->skb_new
+		 */
+		int last_write = ((rctx->fpdu_part_rem <= rctx->skb_new) &&
+				   !rctx->more_ddp_segs) ? 1 : 0;
+
+		if (rctx->first_ddp_seg) {
+			/* position the umem cursor at the write target */
+			rv = siw_rx_umem_init(rctx, mr, write->sink_to);
+			if (rv)
+				return -EINVAL;
+
+		}
+		rv = siw_rx_umem(rctx, bytes, last_write);
+	} else
+		rv = siw_rx_kva(rctx, bytes,
+			       (void *)(write->sink_to +
+					rctx->fpdu_part_rcvd));
+
+	if (rv != bytes)
+		return -EINVAL;
+
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	/* rv equals bytes here, so this yields -EAGAIN */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_rreq:
+ *
+ * Inbound RREQ's cannot carry user data: any remaining FPDU payload
+ * is rejected with -EPROTO, an empty payload yields 0.
+ */
+int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	if (rctx->fpdu_part_rem) {
+		dprint(DBG_ON|DBG_RX, "(QP%d): RREQ with MPA len %d\n",
+			QP_ID(qp), rctx->hdr.ctrl.mpa_len);
+
+		return -EPROTO;
+	}
+	return 0;
+}
+
+/*
+ * siw_get_irqe()
+ *
+ * Reserve one slot of the QP's irq_space and obtain a free WQE for a
+ * READ RESPONSE. The WQE is initialized, bound to @qp (taking a QP
+ * reference) and typed SIW_WR_RDMA_READ_RESP.
+ * Returns NULL, with the irq_space reservation undone, if no slot or
+ * no free WQE is available.
+ */
+static inline struct siw_wqe *siw_get_irqe(struct siw_qp *qp)
+{
+	struct siw_wqe *irqe;
+
+	if (atomic_dec_return(&qp->irq_space) < 0)
+		goto out_undo;
+
+	irqe = siw_freeq_wqe_get(qp);
+	if (!irqe)
+		goto out_undo;
+
+	INIT_LIST_HEAD(&irqe->list);
+	irqe->processed = 0;
+	siw_qp_get(qp);
+	irqe->qp = qp;
+	wr_type(irqe) = SIW_WR_RDMA_READ_RESP;
+
+	return irqe;
+
+out_undo:
+	/* give back the slot taken by atomic_dec_return() */
+	atomic_inc(&qp->irq_space);
+	return NULL;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE
+ * from the request header fields (converted to host byte order, except
+ * for the sink STag which is kept in network byte order).
+ * Put the WQE at the tail of the IRQ, if there is another WQE currently
+ * in transmit processing. If not, make it the current WQE to be
+ * processed and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *	0:       success
+ *	-EPROTO: no IRQ element available (IRD exceeded)
+ */
+
+int siw_init_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe *rsp = siw_get_irqe(qp);
+
+	if (!rsp) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): IRD exceeded!\n", QP_ID(qp));
+		return -EPROTO;
+	}
+	/* data source: described by the single response SGE */
+	rsp->wr.rresp.sge.len = be32_to_cpu(rctx->hdr.rreq.read_size);
+	rsp->bytes = rsp->wr.rresp.sge.len;	/* redundant */
+
+	rsp->wr.rresp.sge.addr = be64_to_cpu(rctx->hdr.rreq.source_to);
+	rsp->wr.rresp.num_sge = rsp->bytes ? 1 : 0;
+
+	rsp->wr.rresp.sge.mem.obj = NULL;	/* defer lookup */
+	rsp->wr.rresp.sge.lkey = be32_to_cpu(rctx->hdr.rreq.source_stag);
+
+	/* data sink at the requesting peer */
+	rsp->wr.rresp.raddr = be64_to_cpu(rctx->hdr.rreq.sink_to);
+	rsp->wr.rresp.rtag = rctx->hdr.rreq.sink_stag; /* NBO */
+
+	rsp->wr_status = SR_WR_QUEUED;
+
+	/*
+	 * Insert into IRQ
+	 *
+	 * TODO: Revisit ordering of genuine SQ WRs and Read Response
+	 * pseudo-WRs. RDMAP specifies that there is no ordering among
+	 * the two directions of transmission, so there is a degree of
+	 * freedom.
+	 *
+	 * The current logic favours Read Responses over SQ work requests
+	 * that are queued but not already in progress.
+	 */
+	lock_sq(qp);
+
+	if (tx_wqe(qp)) {
+		/* transmit path busy: queue behind the current WQE */
+		list_add_tail(&rsp->list, &qp->irq);
+		unlock_sq(qp);
+		return 0;
+	}
+	tx_wqe(qp) = rsp;
+	unlock_sq(qp);
+	/*
+	 * schedule TX work, even if SQ was suspended due to
+	 * ORD limit: it is always OK (and may even prevent peers
+	 * from appl lock) to send RRESPONSE's
+	 */
+	siw_sq_queue_work(qp);
+
+	return 0;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE.
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ *
+ * On the first DDP segment the pending READ REQUEST is taken from the
+ * head of the ORQ, installed as the QP's current rx WQE and the RRESP
+ * header is validated against it.
+ *
+ * return value:
+ *	0:        reached the end of a DDP segment
+ *	-EAGAIN:  to be called again to finish the DDP segment
+ *	-ENODATA: ORQ empty, or no current rx WQE on a non-first segment
+ *	-EINVAL:  unexpected WQE at ORQ head, header mismatch, or data
+ *	          placement failure
+ *	other:    a negative siw_check_sge() result is passed through
+ */
+int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_mr	*mr;
+	struct siw_sge	*sge;
+	int		bytes,
+			rv;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+		/*
+		 * fetch pending RREQ from orq
+		 */
+		lock_orq(qp);
+		if (!list_empty(&qp->orq)) {
+			wqe = list_first_entry(&qp->orq, struct siw_wqe, list);
+			list_del_init(&wqe->list);
+		} else {
+			unlock_orq(qp);
+			dprint(DBG_RX|DBG_ON, "(QP%d): ORQ empty\n",
+				QP_ID(qp));
+			/*
+			 * TODO: Should generate an async error
+			 */
+			rv = -ENODATA; /* or -ENOENT ? */
+			goto done;
+		}
+		unlock_orq(qp);
+
+		rx_wqe(qp) = wqe;
+
+		/* ORQ head must be an untouched READ REQUEST */
+		if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ || wqe->processed) {
+			WARN_ON(wqe->processed);
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			rv = -EINVAL;
+			goto done;
+		}
+
+		wqe->wr_status = SR_WR_INPROGRESS;
+
+		rv = siw_rresp_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			goto done;
+		}
+	} else {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			WARN_ON(1);
+			rv = -ENODATA;
+			goto done;
+		}
+	}
+	if (!rctx->fpdu_part_rem) /* zero length RRESPONSE */
+		return 0;
+
+	/* place no more than what this skb currently provides */
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	sge = wqe->wr.rread.sge; /* there is only one */
+
+	/*
+	 * check target memory which resolves memory on first fragment
+	 */
+	rv = siw_check_sge(qp->pd, sge, SR_MEM_LWRITE, wqe->processed, bytes);
+	if (rv) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): siw_check_sge failed: %d\n",
+			QP_ID(qp), rv);
+		wqe->wc_status = IB_WC_LOC_PROT_ERR;
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		goto done;
+	}
+	mr = siw_mem2mr(sge->mem.obj);
+
+	if (mr->umem) {
+		/*
+		 * Are we going to finish placing the last DDP segment (L=1)
+		 * of the current RDMAP message?
+		 *
+		 * NOTE: siw_rresp_check_ntoh() guarantees that the
+		 * last inbound RDMAP Read Response message exactly matches
+		 * with the RREQ WR.
+		 */
+		int is_last = (bytes + wqe->processed == wqe->bytes) ? 1 : 0;
+
+		if (rctx->first_ddp_seg) {
+			rv = siw_rx_umem_init(rctx, mr, sge->addr);
+			if (rv) {
+				wqe->wc_status = IB_WC_LOC_PROT_ERR;
+				goto done;
+			}
+		}
+		rv = siw_rx_umem(rctx,  bytes, is_last);
+	} else
+		rv = siw_rx_kva(rctx,  bytes,
+				(void *)(sge->addr + wqe->processed));
+	if (rv != bytes) {
+		wqe->wc_status = IB_WC_GENERAL_ERR;
+		rv = -EINVAL;
+		goto done;
+	}
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	wqe->processed += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+done:
+	/*
+	 * Reached with rv < 0 on any error above, or with rv == bytes
+	 * if more data of this DDP segment is still outstanding.
+	 */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_drain_pkt()
+ *
+ * Consume and discard the remaining bytes (rctx->fpdu_part_rem) of the
+ * current FPDU part by copying them piecewise into a small scratch
+ * buffer and advancing the skb accounting of @rctx.
+ *
+ * Each copy is limited to the size of the scratch buffer; the copied
+ * data itself is never used.
+ *
+ * NOTE(review): the loop drains fpdu_part_rem regardless of how many
+ * bytes the current skb still holds (rctx->skb_new); the result of
+ * skb_copy_bits() is ignored. Confirm callers only get here with the
+ * complete remainder available.
+ */
+static void siw_drain_pkt(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	char	buf[128];
+	int	len;
+
+	dprint(DBG_ON|DBG_RX, " (QP%d): drain %d bytes\n",
+		QP_ID(qp), rctx->fpdu_part_rem);
+
+	while (rctx->fpdu_part_rem) {
+		len = min_t(int, rctx->fpdu_part_rem, sizeof(buf));
+
+		/* copy at most sizeof(buf) bytes: never overrun the stack */
+		skb_copy_bits(rctx->skb, rctx->skb_offset, buf, len);
+
+		rctx->skb_copied += len;
+		rctx->skb_offset += len;
+		rctx->skb_new -= len;
+		rctx->fpdu_part_rem -= len;
+	}
+}
+
+/*
+ * siw_proc_unsupp:
+ *
+ * Handler for an FPDU that is not processed further: emit a kernel
+ * warning, discard the remaining bytes via siw_drain_pkt() and
+ * return 0.
+ */
+int siw_proc_unsupp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	WARN_ON(1);
+	siw_drain_pkt(qp, rctx);
+	return 0;
+}
+
+
+/*
+ * siw_proc_terminate:
+ *
+ * Log type, layer and error code of an inbound TERMINATE message,
+ * discard its remaining bytes via siw_drain_pkt() and return 0.
+ */
+int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	printk(KERN_INFO "(QP%d): RX Terminate: type=%d, layer=%d, code=%d\n",
+		QP_ID(qp),
+		rctx->hdr.terminate.term_ctrl.etype,
+		rctx->hdr.terminate.term_ctrl.layer,
+		rctx->hdr.terminate.term_ctrl.ecode);
+
+	siw_drain_pkt(qp, rctx);
+
+	return 0;
+}
+
+
+/*
+ * siw_get_trailer()
+ *
+ * Copy as much of the FPDU trailer as the current skb provides into
+ * the receive context. The destination starts rctx->pad bytes in front
+ * of rctx->trailer.crc, so that pad bytes and CRC are received in one
+ * piece.
+ *
+ * Once the trailer is complete and CRC is enabled, the pad bytes are
+ * added to the running CRC, which is then finalized and compared with
+ * the received CRC.
+ *
+ * return value:
+ *	0:       trailer complete (and CRC correct, if enabled)
+ *	-EAGAIN: trailer not yet complete
+ *	-EINVAL: CRC computation failed or CRC mismatch
+ */
+static int siw_get_trailer(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	u8	*tbuf = (u8 *)&rctx->trailer.crc - rctx->pad;
+	u32	crc_in, crc_own = 0;
+	int	avail = min(rctx->skb_new, rctx->fpdu_part_rem);
+
+	skb_copy_bits(rctx->skb, rctx->skb_offset,
+		      tbuf + rctx->fpdu_part_rcvd, avail);
+
+	rctx->fpdu_part_rcvd += avail;
+	rctx->fpdu_part_rem -= avail;
+
+	rctx->skb_new -= avail;
+	rctx->skb_offset += avail;
+	rctx->skb_copied += avail;
+
+	dprint(DBG_RX, " (QP%d): %d remaining (%d)\n", QP_ID(qp),
+		rctx->fpdu_part_rem, avail);
+
+	if (rctx->fpdu_part_rem)
+		return -EAGAIN;
+
+	/*
+	 * check crc if required
+	 */
+	if (!rctx->crc_enabled)
+		return 0;
+
+	if (rctx->pad && siw_crc_array(&rctx->mpa_crc_hd,
+				       tbuf, rctx->pad) != 0)
+		return -EINVAL;
+
+	crypto_hash_final(&rctx->mpa_crc_hd, (u8 *)&crc_own);
+
+	/*
+	 * CRC32 is computed, transmitted and received directly in NBO,
+	 * so there's never a reason to convert byte order.
+	 */
+	crc_in = rctx->trailer.crc;
+
+	if (crc_in != crc_own) {
+		dprint(DBG_RX|DBG_ON,
+			" (QP%d): CRC ERROR in:=%08x, own=%08x\n",
+			QP_ID(qp), crc_in, crc_own);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+
+/*
+ * siw_get_hdr()
+ *
+ * Copy the iWARP header of the current FPDU from the skb into
+ * rctx->hdr. The fixed control part (struct iwarp_ctrl) is received
+ * first; as soon as it is complete, opcode and protocol versions are
+ * validated. Only then the opcode is used to look up the full header
+ * length in iwarp_pktinfo[] and the rest of the header is received.
+ *
+ * The validation must happen as soon as the control part is complete,
+ * even if the skb holds no further data: a later call resumes past the
+ * control part and relies on the opcode being a valid table index.
+ *
+ * When the header is complete, first_ddp_seg/more_ddp_segs and
+ * prev_ddp_opcode are updated to track DDP segmentation of the
+ * current RDMAP message.
+ *
+ * return value:
+ *	0:       header completely received
+ *	-EAGAIN: more header bytes needed
+ *	-EINVAL: invalid opcode, DDP version or RDMAP version
+ *	-EPROTO: opcode changed within a segmented RDMAP message
+ */
+static int siw_get_hdr(struct siw_iwarp_rx *rctx)
+{
+	struct sk_buff		*skb = rctx->skb;
+	struct iwarp_ctrl	*c_hdr = &rctx->hdr.ctrl;
+
+	int bytes;
+
+	if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
+		/*
+		 * copy first fix part of iwarp hdr
+		 */
+		bytes = min_t(int, rctx->skb_new, sizeof(struct iwarp_ctrl)
+				- rctx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, rctx->skb_offset,
+			      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+		rctx->fpdu_part_rcvd += bytes;
+
+		rctx->skb_new -= bytes;
+		rctx->skb_offset += bytes;
+		rctx->skb_copied += bytes;
+
+		if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl))
+			return -EAGAIN;
+
+		/*
+		 * Control part complete: validate it now, before any
+		 * early return, since this block is skipped on resume.
+		 */
+		if (c_hdr->opcode > RDMAP_TERMINATE) {
+			dprint(DBG_RX|DBG_ON, " opcode %d\n", c_hdr->opcode);
+			return -EINVAL;
+		}
+		if (c_hdr->dv != DDP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " dversion %d\n", c_hdr->dv);
+			return -EINVAL;
+		}
+		if (c_hdr->rv != RDMAP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " rversion %d\n", c_hdr->rv);
+			return -EINVAL;
+		}
+		dprint(DBG_RX, "(QP%d): New Header, opcode:%d\n",
+			RX_QPID(rctx), c_hdr->opcode);
+
+		/* nothing left in this skb for the rest of the header */
+		if (!rctx->skb_new)
+			return -EAGAIN;
+	}
+	/*
+	 * figure out len of current hdr: variable length of
+	 * iwarp hdr forces us to copy hdr information
+	 */
+	bytes = min(rctx->skb_new,
+		  iwarp_pktinfo[c_hdr->opcode].hdr_len - rctx->fpdu_part_rcvd);
+
+	skb_copy_bits(skb, rctx->skb_offset,
+		      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+	rctx->fpdu_part_rcvd += bytes;
+
+	rctx->skb_new -= bytes;
+	rctx->skb_offset += bytes;
+	rctx->skb_copied += bytes;
+
+	if (rctx->fpdu_part_rcvd == iwarp_pktinfo[c_hdr->opcode].hdr_len) {
+		/*
+		 * HDR receive completed. Check if the current DDP segment
+		 * starts a new RDMAP message or continues a previously
+		 * started RDMAP message.
+		 *
+		 * Note well from the comments on DDP reassembly:
+		 * - Support for unordered reception of DDP segments
+		 *   (or FPDUs) from different RDMAP messages is not needed.
+		 * - Unordered reception of DDP segments of the same
+		 *   RDMAP message is not supported. It is probably not
+		 *   needed with most peers.
+		 */
+		siw_dprint_hdr(&rctx->hdr, RX_QPID(rctx), "HDR received");
+
+		if (rctx->more_ddp_segs != 0) {
+			/* continuation: opcode must not change mid-message */
+			rctx->first_ddp_seg = 0;
+			if (rctx->prev_ddp_opcode != c_hdr->opcode) {
+				dprint(DBG_ON,
+					"packet intersection: %d <> %d\n",
+					rctx->prev_ddp_opcode, c_hdr->opcode);
+				return -EPROTO;
+			}
+		} else {
+			rctx->prev_ddp_opcode = c_hdr->opcode;
+			rctx->first_ddp_seg = 1;
+		}
+		/* L bit clear: further DDP segments of this message follow */
+		rctx->more_ddp_segs = (c_hdr->l == 0) ? 1 : 0;
+
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+/*
+ * siw_fpdu_payload_len()
+ *
+ * Payload length of the current FPDU: the MPA length field plus
+ * MPA_HDR_SIZE, minus the bytes already received (rctx->fpdu_part_rcvd).
+ */
+static inline int siw_fpdu_payload_len(struct siw_iwarp_rx *rctx)
+{
+	int mpa_len = (int)(rctx->hdr.ctrl.mpa_len);
+
+	return (mpa_len - rctx->fpdu_part_rcvd) + MPA_HDR_SIZE;
+}
+
+/*
+ * siw_fpdu_trailer_len()
+ *
+ * Trailer length of the current FPDU: MPA_CRC_SIZE plus the number of
+ * pad bytes needed to round (mpa_len + MPA_HDR_SIZE) up to a multiple
+ * of 4.
+ */
+static inline int siw_fpdu_trailer_len(struct siw_iwarp_rx *rctx)
+{
+	int fpdu_len = (int)rctx->hdr.ctrl.mpa_len + MPA_HDR_SIZE;
+	int pad = -fpdu_len & 0x3;
+
+	return MPA_CRC_SIZE + pad;
+}
+
+/*
+ * siw_rreq_complete()
+ *
+ * Complete the current READ REQUEST after READ RESPONSE processing.
+ * It may complete consecutive WQE's which were already SQ
+ * processed before but are awaiting completion due to completion
+ * ordering (see verbs 8.2.2.2).
+ * The READ RESPONSE may also resume SQ processing if it was stalled
+ * due to ORD exhaustion (see verbs 8.2.2.18)
+ * Function stops completion when next READ REQUEST found or ORQ empty.
+ *
+ * @wqe:	WQE of the READ REQUEST being completed
+ * @error:	not read anywhere in this function
+ */
+static void siw_rreq_complete(struct siw_wqe *wqe, int error)
+{
+	struct siw_qp		*qp = wqe->qp;
+	int			num_wc = 1;
+	enum ib_send_flags	flags;
+	LIST_HEAD(c_list);
+
+	flags = wr_flags(wqe);
+
+	/*
+	 * A signaled RREQ goes onto the local completion list.
+	 * An unsignaled one only gives back its SQ slot and its
+	 * reference; it is then not counted in num_wc.
+	 */
+	if (flags & IB_SEND_SIGNALED)
+		list_add(&wqe->list, &c_list);
+	else {
+		atomic_inc(&qp->sq_space);
+		siw_wqe_put(wqe);
+		num_wc = 0;
+	}
+
+	lock_orq(qp);
+
+	/* More WQE's to complete following this RREQ? */
+	if (!list_empty(&qp->orq)) {
+		struct list_head *pos, *n;
+		/*
+		 * Move every ORQ entry up to (not including) the next
+		 * READ REQUEST to the completion list, accumulating
+		 * their flags and counting them.
+		 */
+		list_for_each_safe(pos, n, &qp->orq) {
+			wqe = list_entry_wqe(pos);
+			if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ)
+				break;
+			flags |= wr_flags(wqe);
+			num_wc++;
+			dprint(DBG_WR|DBG_ON,
+				"(QP%d): Resume completion, wr_type %d\n",
+				QP_ID(qp), wr_type(wqe));
+			list_move_tail(pos, &c_list);
+		}
+	}
+	unlock_orq(qp);
+
+	/* Completion is done outside the ORQ lock */
+	if (num_wc)
+		siw_sq_complete(&c_list, qp, num_wc, flags);
+
+	/*
+	 * Check if SQ processing was stalled due to ORD limit
+	 */
+	if (ORD_SUSPEND_SQ(qp)) {
+		lock_sq(qp);
+
+		wqe = siw_next_tx_wqe(qp);
+
+		/*
+		 * If a WQE is waiting and no WQE is currently being
+		 * transmitted, make it the active TX WQE, append it to
+		 * the ORQ and kick SQ processing. The ORQ slot freed by
+		 * the completed RREQ is reused, so orq_space stays as is.
+		 */
+		if (wqe && !tx_wqe(qp)) {
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			list_del_init(&wqe->list);
+			tx_wqe(qp) = wqe;
+
+			list_add_tail(&wqe->list, &qp->orq);
+
+			unlock_sq(qp);
+
+			dprint(DBG_RX, "(QP%d): SQ resume (%d)\n",
+				QP_ID(qp), atomic_read(&qp->sq_space));
+
+			siw_sq_queue_work(qp);
+		} else {
+			/* only new ORQ space if not next RREQ queued */
+			atomic_inc(&qp->orq_space);
+			unlock_sq(qp);
+		}
+	} else
+		atomic_inc(&qp->orq_space);
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Finish processing of an RDMAP message after all of its
+ * DDP segments have been received:
+ *
+ *   o SENDs and READ RESPONSEs get their WQE completed,
+ *   o READ REQUESTs trigger READ RESPONSE initialization,
+ *   o WRITEs drop the reference on an attached memory object.
+ *
+ * Returns 0, or the result of siw_init_rresp() for a READ REQUEST.
+ *
+ * TODO: Could siw_[s,r]_complete() fail? (CQ full)
+ */
+static inline int siw_rdmap_complete(struct siw_qp *qp,
+				     struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*cur;
+	int		ret = 0;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND:
+		cur = rx_wqe(qp);
+
+		/* a solicited SEND is flagged before completion */
+		if (rctx->hdr.ctrl.opcode == RDMAP_SEND_SE)
+			wr_flags(cur) |= IB_SEND_SOLICITED;
+
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		cur->wr_status = SR_WR_DONE;
+		cur->wc_status = IB_WC_SUCCESS;
+
+		siw_rq_complete(cur, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		cur = rx_wqe(qp);
+
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		cur->wr_status = SR_WR_DONE;
+		cur->wc_status = IB_WC_SUCCESS;
+
+		siw_rreq_complete(cur, 0);
+
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		ret = siw_init_rresp(qp, rctx);
+
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Release the memory object reference if one is
+		 * attached to the receive context (inbound WRITE).
+		 * A zero-length WRITE is allowed, but the current
+		 * implementation creates no memory reference for it
+		 * (it is unclear whether memory rights should be
+		 * checked in that case).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp) != NULL)
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+
+	}
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+
+	return ret;
+}
+
+/*
+ * siw_rdmap_error()
+ *
+ * Abort processing of RDMAP message after failure.
+ * SENDs + RRESPs need a receive/send completion if processing
+ * of the message had already started.
+ *
+ * @qp:		QP the failing message was received on
+ * @rctx:	receive context holding the current header and state
+ * @status:	error code of the failure (a negative errno in the
+ *		receive loop). It is handed on to siw_rreq_complete()
+ *		and used to detect -ENODATA, but it is never stored as
+ *		a work completion status.
+ *
+ * TODO: WRITE need local error to be surfaced.
+ *
+ */
+static inline void
+siw_rdmap_error(struct siw_qp *qp, struct siw_iwarp_rx *rctx, int status)
+{
+	struct siw_wqe	*wqe;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		wqe = rx_wqe(qp);
+		if (!wqe)
+			return;
+
+		if (rctx->hdr.ctrl.opcode == RDMAP_SEND_SE)
+			wr_flags(wqe) |= IB_SEND_SOLICITED;
+
+		/* keep a status already recorded for this WQE */
+		if (!wqe->wc_status)
+			wqe->wc_status = IB_WC_GENERAL_ERR;
+
+		wqe->wr_status = SR_WR_DONE;
+		siw_rq_complete(wqe, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		/*
+		 * A READ RESPONSE may flush consecutive WQE's
+		 * which were SQ processed before
+		 */
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		if (rctx->state == SIW_GET_HDR || status == -ENODATA)
+			/*  eventual RREQ left untouched */
+			break;
+
+		wqe = rx_wqe(qp);
+		if (wqe) {
+			/*
+			 * 'status' is an errno style value and must not
+			 * end up in wc_status. As for SEND, keep a status
+			 * already recorded for this WQE and fall back to
+			 * a general error otherwise.
+			 */
+			if (!wqe->wc_status)
+				wqe->wc_status = IB_WC_GENERAL_ERR;
+
+			wqe->wr_status = SR_WR_DONE;
+			/*
+			 * All errors turn the wqe into signalled.
+			 */
+			wr_flags(wqe) |= IB_SEND_SIGNALED;
+			siw_rreq_complete(wqe, status);
+		}
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE)
+		 * While a zero-length WRITE is allowed, the
+		 * current implementation does not create
+		 * a memory reference (it is unclear if memory
+		 * rights should be checked in that case!).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp))
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+	}
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * Runs the FPDU receive state machine (header, data, trailer) over
+ * the new bytes of the socket buffer until the data is used up, more
+ * data is needed (-EAGAIN from a sub-step) or a fatal error occurs.
+ * On a fatal error the current RDMAP message is aborted and the
+ * connection drop is requested via siw_qp_cm_drop().
+ *
+ * @rd_desc:	read descriptor
+ * @skb:	socket buffer
+ * @off:	offset in skb
+ * @len:	skb->len - offset : payload in skb
+ *
+ * Returns the number of bytes consumed from the skb, or 0 if
+ * receive processing is suspended or the QP is not in RTS state.
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len)
+{
+	struct siw_qp		*qp = rd_desc->arg.data;
+	struct siw_iwarp_rx	*rctx = &qp->rx_ctx;
+	int			rv;
+
+	rctx->skb = skb;
+	rctx->skb_new = skb->len - off;
+	rctx->skb_offset = off;
+	rctx->skb_copied = 0;
+
+	dprint(DBG_RX, "(QP%d): new data %d, rx-state %d\n", QP_ID(qp),
+		rctx->skb_new, rctx->state);
+
+	if (unlikely(rctx->rx_suspend == 1 ||
+		     qp->attrs.state != SIW_QP_STATE_RTS)) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): failed. state rx:%d, qp:%d\n",
+			QP_ID(qp), qp->rx_ctx.state, qp->attrs.state);
+		return 0;
+	}
+	while (rctx->skb_new) {
+
+		switch (rctx->state) {
+
+		case SIW_GET_HDR:
+			rv = siw_get_hdr(rctx);
+			if (!rv) {
+				int payload_len;
+
+				if (rctx->crc_enabled &&
+				    siw_crc_rxhdr(rctx) != 0) {
+					rv = -EINVAL;
+					break;
+				}
+				rctx->hdr.ctrl.mpa_len =
+					ntohs(rctx->hdr.ctrl.mpa_len);
+
+				/*
+				 * mpa_len is set by the peer. If it is
+				 * shorter than the header just received,
+				 * the remaining length gets negative:
+				 * treat that as a protocol error instead
+				 * of passing it on to the data path.
+				 */
+				payload_len = siw_fpdu_payload_len(rctx);
+				if (unlikely(payload_len < 0)) {
+					rv = -EPROTO;
+					break;
+				}
+				rctx->fpdu_part_rem = payload_len;
+
+				if (rctx->fpdu_part_rem)
+					rctx->pad = -rctx->fpdu_part_rem & 0x3;
+				else
+					rctx->pad = 0;
+
+				rctx->state = SIW_GET_DATA_START;
+				rctx->fpdu_part_rcvd = 0;
+			}
+			break;
+
+		case SIW_GET_DATA_MORE:
+			/*
+			 * Another data fragment of the same DDP segment.
+			 * Headers will not be checked again by the
+			 * opcode-specific data receive function below.
+			 * Setting first_ddp_seg = 0 avoids repeating
+			 * initializations that may occur only once per
+			 * DDP segment.
+			 */
+			rctx->first_ddp_seg = 0;
+			/* fall through */
+
+		case SIW_GET_DATA_START:
+			/*
+			 * Headers will be checked by the opcode-specific
+			 * data receive function below.
+			 */
+			rv = siw_rx_data(qp, rctx);
+			if (!rv) {
+				rctx->fpdu_part_rem =
+					siw_fpdu_trailer_len(rctx);
+				rctx->fpdu_part_rcvd = 0;
+				rctx->state = SIW_GET_TRAILER;
+			} else
+				rctx->state = SIW_GET_DATA_MORE;
+
+			break;
+
+		case SIW_GET_TRAILER:
+			/*
+			 * read CRC + any padding
+			 */
+			rv = siw_get_trailer(qp, rctx);
+			if (!rv) {
+				/*
+				 * FPDU completed.
+				 * complete RDMAP message if last fragment
+				 */
+				rctx->state = SIW_GET_HDR;
+				rctx->fpdu_part_rcvd = 0;
+
+				if (!rctx->hdr.ctrl.l)
+					/* more frags */
+					break;
+
+				rv = siw_rdmap_complete(qp, rctx);
+			}
+			break;
+
+		default:
+			WARN_ON(1);
+			rv = -EAGAIN;
+		}
+
+		if (unlikely(rv != 0 && rv != -EAGAIN)) {
+			/*
+			 * TODO: implement graceful error handling including
+			 *       generation (and processing) of TERMINATE
+			 *       messages.
+			 *
+			 *	 for now we are left with a bogus rx status
+			 *	 unable to receive any further byte.
+			 *	 BUT: code must handle difference between
+			 *
+			 *	 o protocol syntax (FATAL, framing lost)
+			 *	 o crc	(FATAL, framing lost since we do not
+			 *	        trust packet header (??))
+			 *	 o local resource (maybe non fatal, framing
+			 *	   not lost)
+			 *
+			 *	 errors.
+			 */
+			siw_rdmap_error(qp, rctx, rv);
+
+			dprint(DBG_RX|DBG_ON,
+				"(QP%d): RX ERROR %d at RX state %d\n",
+				QP_ID(qp), rv, rctx->state);
+
+			siw_dprint_rctx(rctx);
+			/*
+			 * Calling siw_cm_queue_work() is safe without
+			 * releasing qp->state_lock because the QP state
+			 * will be transitioned to SIW_QP_STATE_ERROR
+			 * by the siw_work_handler() workqueue handler
+			 * after we return from siw_qp_llp_data_ready().
+			 */
+			siw_qp_cm_drop(qp, 1);
+
+			break;
+		}
+		if (rv) {
+			/* -EAGAIN: current FPDU part needs more data */
+			dprint(DBG_RX, "(QP%d): "
+				"Misaligned FPDU: State: %d, missing: %d\n",
+				QP_ID(qp), rctx->state, rctx->fpdu_part_rem);
+			break;
+		}
+	}
+	return rctx->skb_copied;
+}
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ